Data Cleaning

# Load the player dataset and clean it for modeling.
data <- read.csv(file = 'final_dataset2.csv')
# Keep players with at least 30 games played (screen out less
# statistically significant data points) and age >= 22 (salary cap).
data <- data[data[, "G"] >= 30, ]
data <- data[data[, "Age"] >= 22, ]
# Keep only the columns used in the analysis.
data_reduced <- data[, c('X','Player', 'Pos', 'salary','Age','FG.','FT.','MPG','PPG','APG','RPG','TOPG','BPG','SPG')]
# Drop the hybrid "SF-SG" position. BUG FIX: the original used
# data_reduced[-c(which(...)), ], which silently drops EVERY row when no
# "SF-SG" rows are present (negative indexing with integer(0) selects
# zero rows); a logical filter behaves correctly in both cases.
data_reduced <- data_reduced[data_reduced$Pos != "SF-SG", ]

Split the data into training and test sets (80% / 20%)

# Reproducible 80/20 train/test split: shuffle the row ranks, cut them
# into five roughly equal groups, and hold out the fifth ("test") group.
set.seed(664)
shuffle <- sample.int(nrow(data_reduced))
data_reduced$split <- cut(shuffle, breaks = 5, labels = c("1","2","3","4","test"))
train <- subset(data_reduced, split != "test", select = -split)
test <- subset(data_reduced, split == "test", select = -split)
data_reduced <- subset(data_reduced, select = -split)
# Compare candidate response transformations for approximate normality.
par(mfrow = c(1, 3))
hist(train$salary, freq = F, main = "Histogram of Salary", xlab = "Salary")
hist(log(train$salary), freq = F, main = "Histogram of Log of Salary", xlab = "Log of Salary")
# Fixed xlab: previously read "Square Root of Salary of Salary".
hist(sqrt(train$salary), freq = F, main = "Histogram of Square Root of Salary", xlab = "Square Root of Salary")

# Baseline OLS: raw salary regressed on age and per-game stats (full training set).
model1 = lm(salary~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train)
summary(model1)
## 
## Call:
## lm(formula = salary ~ Age + FG. + FT. + MPG + PPG + APG + RPG + 
##     TOPG + BPG + SPG, data = train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -13227874  -3272493   -340406   2870022  15291207 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -16584115    5072428  -3.269  0.00124 ** 
## Age            618222      95639   6.464 5.79e-10 ***
## FG.           4734316    6456505   0.733  0.46413    
## FT.          -6692194    4028580  -1.661  0.09801 .  
## MPG            113349     105518   1.074  0.28382    
## PPG            697890     153702   4.541 8.95e-06 ***
## APG            195999     384221   0.510  0.61044    
## RPG            446132     246548   1.810  0.07164 .  
## TOPG          -606715    1178014  -0.515  0.60701    
## BPG           -273372    1234923  -0.221  0.82500    
## SPG             12901    1316771   0.010  0.99219    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5427000 on 236 degrees of freedom
## Multiple R-squared:  0.5351, Adjusted R-squared:  0.5154 
## F-statistic: 27.16 on 10 and 236 DF,  p-value: < 2.2e-16
plot(model1)  # standard lm diagnostics: residuals, Q-Q, scale-location, leverage

# Same predictors with log-transformed salary as the response.
model2 = lm(log(salary)~ Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train)
summary(model2)
## 
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + 
##     RPG + TOPG + BPG + SPG, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4695 -0.4710  0.0390  0.5645  1.8255 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.87523    0.76225  12.955  < 2e-16 ***
## Age          0.10782    0.01437   7.502 1.27e-12 ***
## FG.          1.91347    0.97024   1.972 0.049759 *  
## FT.         -0.03295    0.60539  -0.054 0.956642    
## MPG          0.05465    0.01586   3.447 0.000672 ***
## PPG          0.03611    0.02310   1.563 0.119324    
## APG         -0.12428    0.05774  -2.152 0.032380 *  
## RPG          0.02254    0.03705   0.608 0.543603    
## TOPG         0.21732    0.17702   1.228 0.220816    
## BPG         -0.02968    0.18558  -0.160 0.873089    
## SPG          0.08239    0.19788   0.416 0.677507    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8155 on 236 degrees of freedom
## Multiple R-squared:  0.5051, Adjusted R-squared:  0.4842 
## F-statistic: 24.09 on 10 and 236 DF,  p-value: < 2.2e-16
plot(model2)  # diagnostics for the log-salary model

# Same predictors with square-root-transformed salary as the response.
model3 = lm(sqrt(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train)
summary(model3)
## 
## Call:
## lm(formula = sqrt(salary) ~ Age + FG. + FT. + MPG + PPG + APG + 
##     RPG + TOPG + BPG + SPG, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2191.24  -592.62    -5.53   559.79  2364.94 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2822.33     847.76  -3.329  0.00101 ** 
## Age           117.59      15.98   7.356 3.11e-12 ***
## FG.          1470.94    1079.09   1.363  0.17414    
## FT.          -683.56     673.30  -1.015  0.31103    
## MPG            44.87      17.64   2.544  0.01159 *  
## PPG            81.96      25.69   3.191  0.00161 ** 
## APG           -55.33      64.22  -0.862  0.38973    
## RPG            53.67      41.21   1.303  0.19400    
## TOPG           67.27     196.88   0.342  0.73290    
## BPG           -49.00     206.39  -0.237  0.81256    
## SPG            39.13     220.07   0.178  0.85901    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 907 on 236 degrees of freedom
## Multiple R-squared:  0.5447, Adjusted R-squared:  0.5254 
## F-statistic: 28.24 on 10 and 236 DF,  p-value: < 2.2e-16
plot(model3)

Looking into different positions

# Split the train and test sets by position. Use the named "Pos" column
# rather than the positional index train[, 3] / test[, 3], so the code
# keeps working if the column order of data_reduced ever changes.
train_PG <- train[train$Pos == "PG", ]
train_PF <- train[train$Pos == "PF", ]
train_SF <- train[train$Pos == "SF", ]
train_C  <- train[train$Pos == "C", ]
train_SG <- train[train$Pos == "SG", ]

test_PG <- test[test$Pos == "PG", ]
test_PF <- test[test$Pos == "PF", ]
test_SF <- test[test$Pos == "SF", ]
test_C  <- test[test$Pos == "C", ]
test_SG <- test[test$Pos == "SG", ]

First for the PG group,

# Pairwise correlations among the numeric predictors for the PG group
# (drop id, name, and position columns).
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(train_PG[,-c(1,2,3)]), method = 'number')

# Distributions of each predictor for the PG training set.
# Typos fixed in plot labels: "Filed" -> "Field", "Assits" -> "Assists".
# NOTE(review): FG./FT. may be shooting percentages rather than per-game
# counts (coefficient scales suggest proportions) -- confirm the labels.
par(mfrow=c(2,3))
hist(train_PG$Age,freq = F, main = "Histogram of Age", xlab= "Age")
hist(train_PG$FG.,freq = F, main = "Histogram of Field Goals per Game", xlab= "Field Goals per Game")
hist(train_PG$FT.,freq = F, main = "Histogram of Free Throws per Game", xlab= "Free Throws per Game")
hist(train_PG$MPG,freq = F, main = "Histogram of Minutes Played per Game", xlab= "Minutes Played per Game")
hist(train_PG$PPG,freq = F, main = "Histogram of Points per Game", xlab= "Points per Game")
hist(train_PG$APG,freq = F, main = "Histogram of Assists per Game", xlab= "Assists per Game")

hist(train_PG$RPG,freq = F, main = "Histogram of Rebounds per Game", xlab= "Rebounds per Game")
hist(train_PG$TOPG,freq = F, main = "Histogram of Turnovers per Game", xlab= "Turnovers per Game")
hist(train_PG$BPG,freq = F, main = "Histogram of Blocks per Game", xlab= "Blocks per Game")
hist(train_PG$SPG,freq = F, main = "Histogram of Steals per Game", xlab= "Steals per Game")

# Full log-salary model for point guards.
# NOTE(review): the response is written log(train_PG$salary) instead of
# log(salary); predict() still works (only the RHS is re-evaluated on
# newdata), but log(salary) would be cleaner.
lm1 = lm(log(train_PG$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train_PG)
summary(lm1)
## 
## Call:
## lm(formula = log(train_PG$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train_PG)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2274 -0.3493  0.0932  0.4942  1.6099 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  7.158469   2.865079   2.499  0.01691 * 
## Age          0.143467   0.042763   3.355  0.00181 **
## FG.         -0.028013   4.246212  -0.007  0.99477   
## FT.          2.315910   2.428617   0.954  0.34632   
## MPG          0.056338   0.055631   1.013  0.31760   
## PPG          0.021056   0.058364   0.361  0.72026   
## APG         -0.098764   0.163019  -0.606  0.54822   
## RPG          0.005885   0.174597   0.034  0.97329   
## TOPG         0.265868   0.402714   0.660  0.51311   
## BPG          0.721408   1.032994   0.698  0.48920   
## SPG          0.486642   0.587876   0.828  0.41295   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9683 on 38 degrees of freedom
## Multiple R-squared:  0.5554, Adjusted R-squared:  0.4384 
## F-statistic: 4.747 on 10 and 38 DF,  p-value: 0.0002036
plot(lm1)  # residual diagnostics for the full PG model

library(car)
## Loading required package: carData
# Bonferroni outlier test on the studentized residuals.
outlierTest(lm1)
##      rstudent unadjusted p-value Bonferroni p
## 314 -4.227194         0.00014859    0.0072807
## 355 -3.960325         0.00032764    0.0160540
influencePlot(lm1)
# Drop the flagged outliers / influential points (rows identified by the
# original row id stored in column X) and refit.
train2 = subset(train_PG,train_PG$X != 218 &train_PG$X != 314 & train_PG$X != 355 &train_PG$X != 376)
lm2 = lm(log(train2$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train2)
summary(lm2)
## 
## Call:
## lm(formula = log(train2$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19413 -0.40517  0.04323  0.39876  1.25242 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  7.14791    2.11457   3.380  0.00183 **
## Age          0.10219    0.03122   3.273  0.00245 **
## FG.          1.88851    3.21857   0.587  0.56124   
## FT.          3.29799    1.76777   1.866  0.07074 . 
## MPG          0.06323    0.04253   1.487  0.14626   
## PPG          0.02002    0.04170   0.480  0.63427   
## APG         -0.14174    0.12321  -1.150  0.25801   
## RPG          0.09181    0.15781   0.582  0.56457   
## TOPG         0.18058    0.29576   0.611  0.54555   
## BPG          0.11713    1.08238   0.108  0.91446   
## SPG          0.17159    0.46036   0.373  0.71167   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6609 on 34 degrees of freedom
## Multiple R-squared:  0.6704, Adjusted R-squared:  0.5735 
## F-statistic: 6.916 on 10 and 34 DF,  p-value: 8.64e-06
plot(lm2)  # diagnostics after outlier removal

#AIC and Mallow's Cp select model yields the same model 
library(leaps)
# Best-subset search over all ten predictors for the PG group.
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)

# Predictors in the subset with the smallest Mallow's Cp.
rs$which[which.min(rs$cp),]
## (Intercept)         Age         FG.         FT.         MPG         PPG 
##        TRUE        TRUE       FALSE        TRUE        TRUE       FALSE 
##         APG         RPG        TOPG         BPG         SPG 
##       FALSE       FALSE       FALSE       FALSE       FALSE
#step(lm2)
# Reduced PG model chosen by Cp/AIC.
lm3 = lm(log(salary)~Age+FT.+MPG,train2)
summary(lm3)
## 
## Call:
## lm(formula = log(salary) ~ Age + FT. + MPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48115 -0.41848  0.08693  0.51650  1.05139 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.70850    1.14506   6.732 3.94e-08 ***
## Age          0.09637    0.02621   3.677 0.000679 ***
## FT.          3.50883    1.29060   2.719 0.009562 ** 
## MPG          0.08850    0.01515   5.841 7.28e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6401 on 41 degrees of freedom
## Multiple R-squared:  0.6272, Adjusted R-squared:  0.5999 
## F-statistic: 22.99 on 3 and 41 DF,  p-value: 6.868e-09
plot(lm3)  # diagnostics for the reduced model

#no sign of heteroscedasticity
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Studentized Breusch-Pagan test with interactions and squared terms.
bptest(lm3, ~ Age*FT.*MPG + I(Age^2) + I(FT.^2) + I(MPG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm3
## BP = 3.6272, df = 10, p-value = 0.9626
# For comparison, the untransformed response also shows no significant
# heteroscedasticity here (original comment was garbled).
lm4 = lm(salary~Age+FT.+MPG,train2)
bptest(lm4, ~ Age*FT.*MPG + I(Age^2) + I(FT.^2) + I(MPG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm4
## BP = 13.964, df = 10, p-value = 0.1746
#no sign of multicollinearity 
library(car)
vif(lm3)  # variance inflation factors; values near 1 indicate no collinearity
##      Age      FT.      MPG 
## 1.041019 1.095500 1.057356

Now, remove the MPG, and only consider the skills of the player that might affect his salary.

# Repeat the subset search without MPG (playing time), keeping only
# skill-based predictors.
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# Skills-only PG model selected by step().
lm6 = lm(log(salary)~Age+FT.+PPG + RPG,train2)
summary(lm6)
## 
## Call:
## lm(formula = log(salary) ~ Age + FT. + PPG + RPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29594 -0.38783  0.00852  0.33224  1.31336 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.50279    1.20550   7.053 1.58e-08 ***
## Age          0.09896    0.02674   3.700 0.000648 ***
## FT.          3.47504    1.37690   2.524 0.015682 *  
## PPG          0.06962    0.02723   2.556 0.014483 *  
## RPG          0.18613    0.12078   1.541 0.131180    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.651 on 40 degrees of freedom
## Multiple R-squared:  0.6237, Adjusted R-squared:  0.5861 
## F-statistic: 16.57 on 4 and 40 DF,  p-value: 4.367e-08
# Root-mean-squared prediction error: the square root of the mean
# squared difference between x (predictions) and y (observations).
# Used below to compare the full and reduced models on held-out data.
rmse <- function(x, y) {
  squared_err <- (x - y)^2
  sqrt(mean(squared_err))
}
# Out-of-sample RMSE (log scale) on the PG test set: full model (lm2),
# Cp-selected model (lm3), and skills-only model (lm6).
rmse(predict(lm2,test_PG),log(test_PG$salary))
## [1] 0.8903253
rmse(predict(lm3,test_PG),log(test_PG$salary))
## [1] 0.8233036
rmse(predict(lm6,test_PG),log(test_PG$salary))
## [1] 0.7772102
# 95% prediction intervals for the PG test players (log-salary scale).
predict(lm6, test_PG, interval="predict")
##          fit      lwr      upr
## 74  17.05761 15.59015 18.52508
## 95  14.95173 13.59520 16.30827
## 100 15.75783 14.35197 17.16368
## 286 15.55322 14.19188 16.91456
## 359 14.09199 12.72001 15.46397
## 360 15.44492 14.08005 16.80979
## 362 15.61179 14.26325 16.96033
## 385 15.03177 13.68666 16.37689
## 410 14.41061 13.04970 15.77152
## 417 15.27077 13.83427 16.70727
## 432 15.28954 13.94222 16.63685

Second for the C group,

# Pairwise correlations among the numeric predictors for the C (center) group.
library(corrplot)
corrplot(cor(train_C[,-c(1,2,3)]), method = 'number')

# Distributions of each predictor for the C training set.
# Typos fixed in plot labels: "Filed" -> "Field", "Assits" -> "Assists".
par(mfrow=c(2,3))
hist(train_C$Age,freq = F, main = "Histogram of Age", xlab= "Age")
hist(train_C$FG.,freq = F, main = "Histogram of Field Goals per Game", xlab= "Field Goals per Game")
hist(train_C$FT.,freq = F, main = "Histogram of Free Throws per Game", xlab= "Free Throws per Game")
hist(train_C$MPG,freq = F, main = "Histogram of Minutes Played per Game", xlab= "Minutes Played per Game")
hist(train_C$PPG,freq = F, main = "Histogram of Points per Game", xlab= "Points per Game")
hist(train_C$APG,freq = F, main = "Histogram of Assists per Game", xlab= "Assists per Game")

hist(train_C$RPG,freq = F, main = "Histogram of Rebounds per Game", xlab= "Rebounds per Game")
hist(train_C$TOPG,freq = F, main = "Histogram of Turnovers per Game", xlab= "Turnovers per Game")
hist(train_C$BPG,freq = F, main = "Histogram of Blocks per Game", xlab= "Blocks per Game")
hist(train_C$SPG,freq = F, main = "Histogram of Steals per Game", xlab= "Steals per Game")

# Full log-salary model for centers.
# NOTE(review): response written as log(train_C$salary) rather than
# log(salary); works, but log(salary) would be cleaner.
lm1 = lm(log(train_C$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train_C)
summary(lm1)
## 
## Call:
## lm(formula = log(train_C$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train_C)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53913 -0.47077 -0.04493  0.45163  1.88494 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.015055   2.354968   6.801 1.82e-08 ***
## Age          0.063203   0.029587   2.136  0.03802 *  
## FG.         -4.632594   2.452040  -1.889  0.06517 .  
## FT.         -1.547908   1.351832  -1.145  0.25811    
## MPG          0.120110   0.042434   2.830  0.00687 ** 
## PPG         -0.061823   0.063103  -0.980  0.33235    
## APG         -0.420992   0.168229  -2.502  0.01595 *  
## RPG         -0.012047   0.079125  -0.152  0.87966    
## TOPG         0.520320   0.322899   1.611  0.11393    
## BPG         -0.345789   0.317512  -1.089  0.28180    
## SPG         -0.009676   0.509826  -0.019  0.98494    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7656 on 46 degrees of freedom
## Multiple R-squared:  0.5192, Adjusted R-squared:  0.4147 
## F-statistic: 4.968 on 10 and 46 DF,  p-value: 7.24e-05
plot(lm1)  # residual diagnostics for the full C model

library(car)
# Bonferroni outlier test on the studentized residuals.
outlierTest(lm1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 309 2.902362          0.0057165      0.32584
influencePlot(lm1)
# Remove influential observations (by original row id X) and refit.
train2 = subset(train_C, train_C$X != 77 & train_C$X != 97 & train_C$X != 112 & train_C$X != 309 & train_C$X != 333)
lm2 = lm(log(train2$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train2)
summary(lm2)
## 
## Call:
## lm(formula = log(train2$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.40905 -0.40228 -0.01136  0.43361  1.16496 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.18677    2.05173   7.889 9.43e-10 ***
## Age          0.07632    0.02913   2.620  0.01228 *  
## FG.         -5.90716    2.12730  -2.777  0.00824 ** 
## FT.         -1.76820    1.18598  -1.491  0.14364    
## MPG          0.12486    0.03766   3.316  0.00192 ** 
## PPG         -0.06628    0.05660  -1.171  0.24828    
## APG         -0.34769    0.17556  -1.980  0.05439 .  
## RPG         -0.01914    0.06991  -0.274  0.78559    
## TOPG         0.49642    0.33078   1.501  0.14109    
## BPG         -0.28894    0.28094  -1.028  0.30976    
## SPG          0.43788    0.48539   0.902  0.37226    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6535 on 41 degrees of freedom
## Multiple R-squared:  0.6379, Adjusted R-squared:  0.5496 
## F-statistic: 7.224 on 10 and 41 DF,  p-value: 1.931e-06
plot(lm2)  # diagnostics after outlier removal

#AIC and Mallow's Cp select model yields the same model 
library(leaps)
# Best-subset search over all ten predictors for the C group.
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)

# Predictors in the subset with the smallest Mallow's Cp.
rs$which[which.min(rs$cp),]
## (Intercept)         Age         FG.         FT.         MPG         PPG 
##        TRUE        TRUE        TRUE        TRUE        TRUE       FALSE 
##         APG         RPG        TOPG         BPG         SPG 
##        TRUE       FALSE       FALSE       FALSE       FALSE
#step(lm2)
# Reduced C model chosen by Cp/AIC.
lm3 = lm(log(salary)~Age + FG. + FT.+ MPG + APG,train2)
summary(lm3)
## 
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + APG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.44783 -0.45007 -0.04617  0.41905  1.39482 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.42451    1.95793   8.389 7.96e-11 ***
## Age          0.07952    0.02570   3.094  0.00336 ** 
## FG.         -5.72859    2.05632  -2.786  0.00773 ** 
## FT.         -2.19521    1.04744  -2.096  0.04163 *  
## MPG          0.10607    0.01719   6.172 1.60e-07 ***
## APG         -0.25550    0.14768  -1.730  0.09032 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6454 on 46 degrees of freedom
## Multiple R-squared:  0.6038, Adjusted R-squared:  0.5607 
## F-statistic: 14.02 on 5 and 46 DF,  p-value: 2.48e-08
plot(lm3)  # diagnostics for the reduced model

#no sign of heteroscedasticity
library(lmtest)
# Studentized Breusch-Pagan test with interactions and squared terms.
bptest(lm3, ~ Age * FG. * FT.* MPG * APG + I(Age^2) + I(FG.^2) + I(FT.^2) + I(MPG^2) + I(APG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm3
## BP = 20.855, df = 36, p-value = 0.9794
# For comparison, the untransformed response also shows no significant
# heteroscedasticity (original comment was garbled).
lm4 = lm(salary~Age + FG. + FT.+ MPG + APG,train2)
bptest(lm4, ~ Age * FG. * FT.* MPG * APG + I(Age^2) + I(FG.^2) + I(FT.^2) + I(MPG^2) + I(APG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm4
## BP = 32.774, df = 36, p-value = 0.6228
#no sign of multicollinearity 
library(car)
vif(lm3)  # VIFs near 1-2: no collinearity concern
##      Age      FG.      FT.      MPG      APG 
## 1.113804 1.754544 1.648388 1.831585 2.041956

Now, remove the MPG, and only consider the skills of the player that might affect his salary.

# Repeat the subset search without MPG (playing time) for the C group.
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# Skills-only C model selected by step().
lm6 = lm(log(salary)~ Age+FG.+FT.+RPG,train2)
summary(lm6)
## 
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + RPG, data = train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6734 -0.4292 -0.0923  0.5551  1.1445 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.45252    2.09953   8.789 1.74e-11 ***
## Age          0.06505    0.02806   2.318  0.02485 *  
## FG.         -6.86474    2.21839  -3.094  0.00332 ** 
## FT.         -2.72837    1.13792  -2.398  0.02052 *  
## RPG          0.18506    0.03400   5.443 1.85e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7134 on 47 degrees of freedom
## Multiple R-squared:  0.5053, Adjusted R-squared:  0.4632 
## F-statistic:    12 on 4 and 47 DF,  p-value: 8.44e-07
# Out-of-sample RMSE (log scale) on the C test set: full model (lm2),
# Cp-selected model (lm3), and skills-only model (lm6).
# (rmse is redefined here identically to the PG section.)
rmse=function(x,y){sqrt(mean((x-y)^2))}
rmse(predict(lm2,test_C),log(test_C$salary))
## [1] 0.9035814
rmse(predict(lm3,test_C),log(test_C$salary))
## [1] 0.9377931
rmse(predict(lm6,test_C),log(test_C$salary))
## [1] 1.03725
# 95% prediction intervals for the C test players (log-salary scale).
predict(lm6, test_C, interval="predict")
##          fit      lwr      upr
## 5   15.39580 13.89035 16.90125
## 15  17.69028 16.05980 19.32075
## 62  15.95068 14.42018 17.48118
## 63  15.37115 13.90986 16.83244
## 72  14.71429 13.15701 16.27157
## 150 15.71457 14.23488 17.19426
## 215 15.43631 13.94066 16.93196
## 245 16.42167 14.89485 17.94848
## 271 16.31635 14.81985 17.81286
## 290 16.94133 15.40278 18.47989
## 301 16.18228 14.57069 17.79386
## 315 15.29552 13.76339 16.82765
## 375 15.93430 14.42219 17.44641
## 411 15.12408 13.64060 16.60756
## 450 16.17169 14.65599 17.68739

Third, for the SG group,

# Pairwise correlations among the numeric predictors for the SG group.
library(corrplot)
corrplot(cor(train_SG[,-c(1,2,3)]), method = 'number')

# Distributions of each predictor for the SG training set.
# Typos fixed in plot labels: "Filed" -> "Field", "Assits" -> "Assists".
par(mfrow=c(2,3))
hist(train_SG$Age,freq = F, main = "Histogram of Age", xlab= "Age")
hist(train_SG$FG.,freq = F, main = "Histogram of Field Goals per Game", xlab= "Field Goals per Game")
hist(train_SG$FT.,freq = F, main = "Histogram of Free Throws per Game", xlab= "Free Throws per Game")
hist(train_SG$MPG,freq = F, main = "Histogram of Minutes Played per Game", xlab= "Minutes Played per Game")
hist(train_SG$PPG,freq = F, main = "Histogram of Points per Game", xlab= "Points per Game")
hist(train_SG$APG,freq = F, main = "Histogram of Assists per Game", xlab= "Assists per Game")

hist(train_SG$RPG,freq = F, main = "Histogram of Rebounds per Game", xlab= "Rebounds per Game")
hist(train_SG$TOPG,freq = F, main = "Histogram of Turnovers per Game", xlab= "Turnovers per Game")
hist(train_SG$BPG,freq = F, main = "Histogram of Blocks per Game", xlab= "Blocks per Game")
hist(train_SG$SPG,freq = F, main = "Histogram of Steals per Game", xlab= "Steals per Game")

# Full log-salary model for shooting guards.
# NOTE(review): response written as log(train_SG$salary) rather than
# log(salary); works, but log(salary) would be cleaner.
lm1 = lm(log(train_SG$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train_SG)
summary(lm1)
## 
## Call:
## lm(formula = log(train_SG$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train_SG)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.83255 -0.44192  0.01654  0.52603  1.52688 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.47292    1.96487   5.839 5.82e-07 ***
## Age          0.04549    0.03037   1.498  0.14130    
## FG.          0.54106    3.28136   0.165  0.86979    
## FT.          0.47179    1.12032   0.421  0.67572    
## MPG          0.07770    0.02846   2.730  0.00907 ** 
## PPG          0.02358    0.04475   0.527  0.60098    
## APG          0.07345    0.14661   0.501  0.61886    
## RPG         -0.06213    0.13715  -0.453  0.65277    
## TOPG        -0.07060    0.36555  -0.193  0.84775    
## BPG          0.46570    0.60771   0.766  0.44758    
## SPG          0.06410    0.41413   0.155  0.87770    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7221 on 44 degrees of freedom
## Multiple R-squared:  0.646,  Adjusted R-squared:  0.5655 
## F-statistic: 8.028 on 10 and 44 DF,  p-value: 3.443e-07
plot(lm1)  # residual diagnostics for the full SG model

library(car)
# Bonferroni outlier test on the studentized residuals.
outlierTest(lm1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferroni p
## 145 -3.076338          0.0036374      0.20006
influencePlot(lm1)
# Remove influential observations (by original row id X) and refit.
train2 = subset(train_SG, train_SG$X != 8 & train_SG$X != 17 & train_SG$X != 79 & train_SG$X != 145)
lm2 = lm(log(train2$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train2)
summary(lm2)
## 
## Call:
## lm(formula = log(train2$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14855 -0.31580 -0.02903  0.41578  1.09627 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.94774    1.75651   6.802 3.54e-08 ***
## Age          0.06125    0.02761   2.218  0.03226 *  
## FG.         -1.65592    3.11812  -0.531  0.59831    
## FT.          0.46095    1.08641   0.424  0.67363    
## MPG          0.07364    0.02648   2.781  0.00821 ** 
## PPG          0.05902    0.04112   1.435  0.15898    
## APG          0.06021    0.13363   0.451  0.65475    
## RPG         -0.04417    0.12773  -0.346  0.73132    
## TOPG        -0.22281    0.33945  -0.656  0.51534    
## BPG          0.52355    0.78376   0.668  0.50797    
## SPG         -0.03669    0.37773  -0.097  0.92310    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6376 on 40 degrees of freedom
## Multiple R-squared:  0.7317, Adjusted R-squared:  0.6647 
## F-statistic: 10.91 on 10 and 40 DF,  p-value: 1.219e-08
plot(lm2)  # diagnostics after outlier removal

#AIC and Mallow's Cp select model yields the same model 
library(leaps)
# Best-subset search over all ten predictors for the SG group.
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)

# Predictors in the subset with the smallest Mallow's Cp.
rs$which[which.min(rs$cp),]
## (Intercept)         Age         FG.         FT.         MPG         PPG 
##        TRUE        TRUE       FALSE       FALSE        TRUE        TRUE 
##         APG         RPG        TOPG         BPG         SPG 
##       FALSE       FALSE       FALSE       FALSE       FALSE
#step(lm2)
# Reduced SG model chosen by Cp/AIC.
lm3 = lm(log(salary)~Age + MPG + PPG,train2)
summary(lm3)
## 
## Call:
## lm(formula = log(salary) ~ Age + MPG + PPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.18382 -0.38717 -0.05981  0.42089  1.20669 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.38183    0.67270  16.920  < 2e-16 ***
## Age          0.06997    0.02354   2.973  0.00464 ** 
## MPG          0.07055    0.02137   3.301  0.00184 ** 
## PPG          0.04798    0.02631   1.823  0.07460 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5976 on 47 degrees of freedom
## Multiple R-squared:  0.7231, Adjusted R-squared:  0.7054 
## F-statistic: 40.91 on 3 and 47 DF,  p-value: 3.743e-13
plot(lm3)  # diagnostics for the reduced model

#no sign of heteroscedasticity
library(lmtest)
# Studentized Breusch-Pagan test with interactions and squared terms.
bptest(lm3, ~ Age * MPG * PPG + I(Age^2) + I(MPG^2) + I(PPG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm3
## BP = 8.1532, df = 10, p-value = 0.6139
# For comparison, the untransformed response also shows no significant
# heteroscedasticity (original comment was garbled).
lm4 = lm(salary~Age +  MPG + PPG,train2)
bptest(lm4, ~ Age * MPG * PPG + I(Age^2) + I(MPG^2) + I(PPG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm4
## BP = 15.514, df = 10, p-value = 0.1144
#not a sign of multicollinearity 
library(car)
vif(lm3)  # MPG/PPG VIFs ~4: below the usual concern threshold of 5-10
##      Age      MPG      PPG 
## 1.020484 4.258776 4.235640

Now, remove the MPG, and only consider the skills of the player that might affect his salary.

# Repeat the subset search without MPG (playing time) for the SG group.
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# Skills-only SG model selected by step().
lm6 = lm(log(salary)~ Age + PPG,train2)
summary(lm6)
## 
## Call:
## lm(formula = log(salary) ~ Age + PPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47386 -0.44682 -0.03799  0.35105  1.20251 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.09741    0.69938  17.297  < 2e-16 ***
## Age          0.07571    0.02578   2.937  0.00508 ** 
## PPG          0.12372    0.01415   8.746 1.68e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6563 on 48 degrees of freedom
## Multiple R-squared:  0.6589, Adjusted R-squared:  0.6447 
## F-statistic: 46.36 on 2 and 48 DF,  p-value: 6.164e-12
# Out-of-sample RMSE (log scale) on the SG test set: full model (lm2),
# Cp-selected model (lm3), and skills-only model (lm6).
# BUG FIX: the original evaluated RMSE and prediction intervals on
# train_SG (the training data), unlike the PG, C, and PF sections which
# use the held-out test set; using test_SG measures generalization
# rather than training fit. The printed output below this chunk is stale
# until the document is re-knit.
rmse <- function(x,y){sqrt(mean((x-y)^2))}
rmse(predict(lm2, test_SG), log(test_SG$salary))
rmse(predict(lm3, test_SG), log(test_SG$salary))
rmse(predict(lm6, test_SG), log(test_SG$salary))
# 95% prediction intervals for the SG test players (log-salary scale).
predict(lm6, test_SG, interval = "predict")
##          fit      lwr      upr
## 8   15.02092 13.68493 16.35691
## 9   14.49684 13.14663 15.84704
## 13  15.62335 14.28466 16.96204
## 17  14.68138 13.33830 16.02446
## 26  14.93808 13.55885 16.31730
## 28  15.90603 14.56997 17.24208
## 30  14.84685 13.50354 16.19015
## 37  15.44475 14.10786 16.78163
## 39  16.71635 15.32909 18.10360
## 43  14.35012 12.99581 15.70443
## 48  14.77284 13.42847 16.11720
## 49  15.65892 14.31971 16.99812
## 67  16.00651 14.64900 17.36401
## 76  14.35056 12.99241 15.70872
## 79  15.42927 14.08501 16.77352
## 86  14.97116 13.63298 16.30935
## 92  14.50853 13.14460 15.87247
## 96  17.06295 15.68685 18.43906
## 101 15.17262 13.83142 16.51381
## 111 15.83516 14.49763 17.17268
## 119 14.17161 12.80257 15.54065
## 123 16.23571 14.82380 17.64762
## 124 15.75936 14.42518 17.09354
## 131 16.52186 15.17159 17.87212
## 145 16.01997 14.66245 17.37748
## 168 16.71333 15.33787 18.08880
## 169 15.54402 14.18250 16.90554
## 180 17.98227 16.53942 19.42511
## 191 15.52863 14.00739 17.04987
## 205 15.26699 13.92683 16.60716
## 206 16.95810 15.58775 18.32846
## 208 15.15130 13.80156 16.50103
## 224 15.94229 14.60620 17.27839
## 226 15.70705 14.36708 17.04703
## 241 15.72264 14.38869 17.05660
## 252 15.80861 14.47405 17.14318
## 253 15.57305 14.22873 16.91737
## 259 16.61777 15.25998 17.97557
## 272 15.28382 13.95051 16.61712
## 274 14.83387 13.49453 16.17321
## 278 17.23528 15.85194 18.61862
## 293 14.01261 12.64281 15.38241
## 295 15.09893 13.75091 16.44695
## 328 15.41875 14.05472 16.78278
## 329 15.72361 14.38695 17.06027
## 337 14.59499 13.24720 15.94278
## 344 14.65561 13.31166 15.99956
## 368 15.80893 14.46703 17.15084
## 390 14.31674 12.95762 15.67587
## 396 14.26027 12.89380 15.62674
## 414 14.95403 13.61700 16.29106
## 418 14.91715 13.57947 16.25483
## 420 14.45065 13.09917 15.80212
## 427 15.16691 13.83257 16.50124
## 441 16.85231 15.46780 18.23682

Fourth for the PF group,

# Pairwise correlations among the numeric predictors for the PF group.
library(corrplot)
corrplot(cor(train_PF[,-c(1,2,3)]), method = 'number')

# Distributions of each predictor for the PF training set.
# Typos fixed in plot labels: "Filed" -> "Field", "Assits" -> "Assists".
par(mfrow=c(2,3))
hist(train_PF$Age,freq = F, main = "Histogram of Age", xlab= "Age")
hist(train_PF$FG.,freq = F, main = "Histogram of Field Goals per Game", xlab= "Field Goals per Game")
hist(train_PF$FT.,freq = F, main = "Histogram of Free Throws per Game", xlab= "Free Throws per Game")
hist(train_PF$MPG,freq = F, main = "Histogram of Minutes Played per Game", xlab= "Minutes Played per Game")
hist(train_PF$PPG,freq = F, main = "Histogram of Points per Game", xlab= "Points per Game")
hist(train_PF$APG,freq = F, main = "Histogram of Assists per Game", xlab= "Assists per Game")

hist(train_PF$RPG,freq = F, main = "Histogram of Rebounds per Game", xlab= "Rebounds per Game")
hist(train_PF$TOPG,freq = F, main = "Histogram of Turnovers per Game", xlab= "Turnovers per Game")
hist(train_PF$BPG,freq = F, main = "Histogram of Blocks per Game", xlab= "Blocks per Game")
hist(train_PF$SPG,freq = F, main = "Histogram of Steals per Game", xlab= "Steals per Game")

# Full log-salary model for power forwards.
# NOTE(review): response written as log(train_PF$salary) rather than
# log(salary); works, but log(salary) would be cleaner.
lm1 = lm(log(train_PF$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train_PF)
summary(lm1)
## 
## Call:
## lm(formula = log(train_PF$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train_PF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26801 -0.43526 -0.09354  0.47326  1.17178 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.417567   1.371366   7.596 3.30e-09 ***
## Age          0.138161   0.029515   4.681 3.41e-05 ***
## FG.         -1.294810   2.163701  -0.598   0.5530    
## FT.          0.636577   1.065327   0.598   0.5536    
## MPG         -0.001643   0.038856  -0.042   0.9665    
## PPG          0.076128   0.046796   1.627   0.1118    
## APG          0.289919   0.170015   1.705   0.0961 .  
## RPG          0.234687   0.110705   2.120   0.0404 *  
## TOPG        -0.813774   0.529994  -1.535   0.1327    
## BPG          0.038748   0.291752   0.133   0.8950    
## SPG         -0.018530   0.497542  -0.037   0.9705    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6726 on 39 degrees of freedom
## Multiple R-squared:  0.681,  Adjusted R-squared:  0.5992 
## F-statistic: 8.325 on 10 and 39 DF,  p-value: 4.839e-07
plot(lm1)  # residual diagnostics for the full PF model

library(car)
# Bonferroni outlier test on the studentized residuals.
outlierTest(lm1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferroni p
## 308 -2.064256           0.045869           NA
influencePlot(lm1)
# Remove influential observations (by original row id X) and refit.
train2 = subset(train_PF, train_PF$X != 118 & train_PF$X != 251 & train_PF$X != 262 & train_PF$X != 276 & train_PF$X != 308 & train_PF$X != 458)
lm2 = lm(log(train2$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train2)
summary(lm2)
## 
## Call:
## lm(formula = log(train2$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0158 -0.3305 -0.1056  0.4281  0.9896 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.98857    1.32042   7.565 1.06e-08 ***
## Age          0.16526    0.03563   4.638 5.33e-05 ***
## FG.         -1.43707    2.12482  -0.676   0.5035    
## FT.          0.49670    1.07943   0.460   0.6484    
## MPG         -0.01472    0.04534  -0.325   0.7475    
## PPG          0.10276    0.05899   1.742   0.0908 .  
## APG          0.33071    0.24684   1.340   0.1895    
## RPG          0.16639    0.11161   1.491   0.1455    
## TOPG        -0.66710    0.57397  -1.162   0.2535    
## BPG          0.20804    0.45248   0.460   0.6487    
## SPG         -0.05157    0.46642  -0.111   0.9126    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.624 on 33 degrees of freedom
## Multiple R-squared:  0.719,  Adjusted R-squared:  0.6338 
## F-statistic: 8.443 on 10 and 33 DF,  p-value: 1.265e-06
plot(lm2)

#AIC selects: Age + PPG
#Mallow's Cp selects: Age + PPG + APG + RPG + TOPG, but multicollinearity exists among these

library(leaps)
# Best-subset selection on the cleaned PF data, log-salary response.
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# Mallow's Cp for each best subset size; regsubsets keeps sizes 1-8 by
# default, i.e. 2-9 parameters including the intercept.  A well-fitting
# model has Cp close to the parameter count (the y = x reference line).
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)

# Which predictors enter the minimum-Cp subset.
rs$which[which.min(rs$cp),]
## (Intercept)         Age         FG.         FT.         MPG         PPG 
##        TRUE        TRUE       FALSE       FALSE       FALSE        TRUE 
##         APG         RPG        TOPG         BPG         SPG 
##       FALSE       FALSE       FALSE       FALSE       FALSE
#step(lm2)
# Cp-minimizing model: Age + PPG only.
lm3 = lm(log(salary)~Age + PPG,train2)
summary(lm3)
## 
## Call:
## lm(formula = log(salary) ~ Age + PPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31483 -0.35817  0.02036  0.42330  1.02781 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.70125    0.74194  13.075 3.24e-16 ***
## Age          0.16959    0.02618   6.477 9.07e-08 ***
## PPG          0.12225    0.01630   7.500 3.28e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5921 on 41 degrees of freedom
## Multiple R-squared:  0.6856, Adjusted R-squared:  0.6703 
## F-statistic: 44.71 on 2 and 41 DF,  p-value: 4.983e-11
# Diagnostics for the reduced model.
plot(lm3)

#no sign of heteroscedasticity
library(lmtest)
# Studentized Breusch-Pagan test; the auxiliary regression includes the
# interaction and squared terms (a White-style specification) for lm3.
bptest(lm3, ~ Age * PPG + I(Age^2) + I(PPG^2) , data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm3
## BP = 4.1702, df = 5, p-value = 0.5252
#not transform
# Same heteroscedasticity check on the untransformed-salary counterpart.
lm4 = lm(salary~Age + PPG ,train2)
bptest(lm4, ~ Age * PPG + I(Age^2) + I(PPG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm4
## BP = 5.6517, df = 5, p-value = 0.3416
#no sign of multicollinearity 
library(car)
# Variance inflation factors for lm3; values near 1 indicate no collinearity.
vif(lm3)
##      Age      PPG 
## 1.009991 1.009991

Now, remove MPG and consider only the player's skill statistics that might affect his salary.

# Repeat best-subset selection without MPG (skill variables only).
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# Full skills-only model, used as the starting point for step() below.
# NOTE(review): response written as log(train2$salary) inside the formula;
# log(salary) with data = train2 is the safer idiom.
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# AIC-selected (step) skills model.
lm6 = lm(log(salary)~ Age+PPG+APG+RPG+TOPG,train2)
summary(lm6)
## 
## Call:
## lm(formula = log(salary) ~ Age + PPG + APG + RPG + TOPG, data = train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0558 -0.3414 -0.0812  0.3528  0.9883 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.72676    0.74297  13.092 1.17e-15 ***
## Age          0.16139    0.02650   6.090 4.28e-07 ***
## PPG          0.10014    0.03620   2.766  0.00871 ** 
## APG          0.31670    0.20653   1.533  0.13344    
## RPG          0.13813    0.09058   1.525  0.13555    
## TOPG        -0.68436    0.51223  -1.336  0.18948    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5893 on 38 degrees of freedom
## Multiple R-squared:  0.7114, Adjusted R-squared:  0.6734 
## F-statistic: 18.73 on 5 and 38 DF,  p-value: 2.36e-09
#compare rmse for original model and reduced model
# Root-mean-squared error between predictions `x` and observations `y`.
# Backward compatible with the original one-liner; the new optional
# `na.rm` argument (default FALSE, preserving NA propagation) lets callers
# ignore missing prediction/observation pairs.
rmse <- function(x, y, na.rm = FALSE) {
  sqrt(mean((x - y)^2, na.rm = na.rm))
}
# RMSE on the log-salary scale, evaluated over the FULL PF training set
# (train_PF, i.e. including the outliers removed before fitting).
rmse(predict(lm2,train_PF),log(train_PF$salary))
## [1] 0.633717
rmse(predict(lm3,train_PF),log(train_PF$salary))
## [1] 0.6672642
rmse(predict(lm6,train_PF),log(train_PF$salary))
## [1] 0.6296484
predict(lm6, train_PF, interval="predict")
##          fit      lwr      upr
## 2   15.61036 14.34463 16.87609
## 6   15.67198 14.37449 16.96947
## 23  16.06217 14.79191 17.33243
## 33  17.28629 15.85959 18.71299
## 54  17.02343 15.70466 18.34221
## 82  15.34615 14.09219 16.60011
## 88  14.61347 13.36189 15.86505
## 98  16.65687 15.42056 17.89318
## 118 16.56938 14.64690 18.49185
## 132 14.64953 13.36524 15.93381
## 133 16.16674 14.93998 17.39349
## 136 15.15574 13.89765 16.41383
## 147 17.00424 15.65314 18.35533
## 152 13.63947 12.36798 14.91097
## 153 16.06586 14.78669 17.34504
## 170 14.82259 13.57454 16.07064
## 181 16.21979 14.87783 17.56176
## 184 15.79489 14.46497 17.12481
## 185 15.59436 14.29481 16.89391
## 187 14.32602 13.04868 15.60336
## 200 14.59255 13.35651 15.82858
## 221 15.53831 14.29732 16.77929
## 232 14.30994 13.07202 15.54786
## 249 15.39417 14.10383 16.68451
## 251 15.18076 13.94938 16.41214
## 262 15.50669 13.98011 17.03328
## 266 15.09471 13.78960 16.39982
## 273 14.96296 13.72332 16.20259
## 276 18.98504 17.10004 20.87003
## 279 15.42891 14.09046 16.76736
## 294 15.94465 14.71034 17.17896
## 303 16.15104 14.91907 17.38301
## 308 14.84792 13.61770 16.07815
## 325 15.52595 14.30644 16.74546
## 334 16.19328 14.91247 17.47410
## 342 15.95398 14.68440 17.22357
## 343 14.84329 13.54818 16.13840
## 347 14.90394 13.67226 16.13562
## 353 14.85074 13.61870 16.08278
## 377 15.88844 14.64575 17.13113
## 379 14.03916 12.79349 15.28484
## 382 13.86332 12.61043 15.11621
## 383 15.82604 14.56953 17.08256
## 401 16.73463 15.46009 18.00918
## 407 16.16808 14.94582 17.39034
## 413 16.23805 14.91792 17.55818
## 422 15.63290 14.40313 16.86268
## 424 14.75007 13.50228 15.99787
## 429 14.33206 13.07270 15.59143
## 458 17.25709 15.85288 18.66130

Finally, for the SF group, we need to proceed with caution because there are few data points.

library(corrplot)
# Pairwise correlations among the numeric columns of the SF training set;
# the first three columns (presumably X, Player, Pos — confirm against
# data_reduced's column order) are dropped before cor().
corrplot(cor(train_SF[,-c(1,2,3)]), method = 'number')

# Marginal distributions of each candidate predictor for the SF training
# subset, six panels per page (density scale, freq = FALSE).
# NOTE(review): FG. and FT. look like shooting *percentages* ("FG%"/"FT%"
# become FG./FT. on CSV import) rather than per-game counts — the labels
# below keep the original "per Game" wording; confirm the units.
par(mfrow=c(2,3))
hist(train_SF$Age, freq = FALSE, main = "Histogram of Age", xlab = "Age")
hist(train_SF$FG., freq = FALSE, main = "Histogram of Field Goals per Game", xlab = "Field Goals per Game")
hist(train_SF$FT., freq = FALSE, main = "Histogram of Free Throws per Game", xlab = "Free Throws per Game")
hist(train_SF$MPG, freq = FALSE, main = "Histogram of Minutes Played per Game", xlab = "Minutes Played per Game")
hist(train_SF$PPG, freq = FALSE, main = "Histogram of Points per Game", xlab = "Points per Game")
hist(train_SF$APG, freq = FALSE, main = "Histogram of Assists per Game", xlab = "Assists per Game")

hist(train_SF$RPG, freq = FALSE, main = "Histogram of Rebounds per Game", xlab = "Rebounds per Game")
hist(train_SF$TOPG, freq = FALSE, main = "Histogram of Turnovers per Game", xlab = "Turnovers per Game")
hist(train_SF$BPG, freq = FALSE, main = "Histogram of Blocks per Game", xlab = "Blocks per Game")
hist(train_SF$SPG, freq = FALSE, main = "Histogram of Steals per Game", xlab = "Steals per Game")

# Full main-effects model for the SF group, log-salary response.
# NOTE(review): this reuses (overwrites) the name lm1 from the previous
# position group; the response is written as log(train_SF$salary) inside
# the formula — log(salary) with data = train_SF is the safer idiom.
lm1 = lm(log(train_SF$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train_SF)
summary(lm1)
## 
## Call:
## lm(formula = log(train_SF$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train_SF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.00452 -0.42213  0.05411  0.44955  1.65939 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.81493    2.29050   3.848  0.00073 ***
## Age          0.12539    0.04495   2.790  0.00994 ** 
## FG.          4.33691    3.64171   1.191  0.24488    
## FT.          0.41396    2.24372   0.184  0.85511    
## MPG          0.02813    0.04995   0.563  0.57834    
## PPG          0.18076    0.09295   1.945  0.06315 .  
## APG         -0.37214    0.43300  -0.859  0.39826    
## RPG         -0.28244    0.22314  -1.266  0.21726    
## TOPG        -0.23898    1.04375  -0.229  0.82076    
## BPG          0.52375    0.67757   0.773  0.44678    
## SPG          0.21274    0.57001   0.373  0.71213    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9418 on 25 degrees of freedom
## Multiple R-squared:  0.5985, Adjusted R-squared:  0.4379 
## F-statistic: 3.727 on 10 and 25 DF,  p-value: 0.003658
# Diagnostic plots for the SF full model.
plot(lm1)

library(car)
# Bonferroni outlier test on studentized residuals.
outlierTest(lm1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 264 -3.30469          0.0029776      0.10719
# Influence plot: studentized residuals vs leverage (bubble ~ Cook's D).
influencePlot(lm1)
# Drop the flagged rows by id X.  NOTE(review): this overwrites the
# PF-group train2 created earlier in the file.
train2 = subset(train_SF, train_SF$X != 235 & train_SF$X != 254 & train_SF$X != 264 & train_SF$X != 350)
# Refit the full SF model on the cleaned data.
lm2 = lm(log(train2$salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG,train2)
summary(lm2)
## 
## Call:
## lm(formula = log(train2$salary) ~ Age + FG. + FT. + MPG + PPG + 
##     APG + RPG + TOPG + BPG + SPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.23421 -0.42048 -0.03894  0.46638  1.03455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.77632    2.11238   5.575 1.56e-05 ***
## Age          0.10796    0.03827   2.821   0.0102 *  
## FG.          1.65227    3.09157   0.534   0.5986    
## FT.         -1.80401    2.07271  -0.870   0.3939    
## MPG          0.04705    0.04770   0.986   0.3352    
## PPG          0.14591    0.08838   1.651   0.1136    
## APG          0.40514    0.42611   0.951   0.3525    
## RPG         -0.34705    0.20498  -1.693   0.1052    
## TOPG        -0.89746    0.95123  -0.943   0.3562    
## BPG          1.53581    0.91001   1.688   0.1063    
## SPG         -0.19293    0.60743  -0.318   0.7539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.746 on 21 degrees of freedom
## Multiple R-squared:  0.654,  Adjusted R-squared:  0.4892 
## F-statistic: 3.969 on 10 and 21 DF,  p-value: 0.003756
# Diagnostics for the refitted SF model.
plot(lm2)

# AIC and Mallow's Cp select the same model here.
library(leaps)
# Best-subset selection on the cleaned SF data.
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# Cp vs number of parameters (2-9 incl. intercept); y = x is the reference.
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)

# Which predictors enter the minimum-Cp subset.
rs$which[which.min(rs$cp),]
## (Intercept)         Age         FG.         FT.         MPG         PPG 
##        TRUE        TRUE       FALSE       FALSE       FALSE        TRUE 
##         APG         RPG        TOPG         BPG         SPG 
##       FALSE        TRUE       FALSE        TRUE       FALSE
#step(lm2)
# Cp/AIC-selected SF model: Age + PPG + RPG + BPG.
lm3 = lm(log(salary)~Age + PPG + RPG + BPG,train2)
summary(lm3)
## 
## Call:
## lm(formula = log(salary) ~ Age + PPG + RPG + BPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.33417 -0.40081 -0.07458  0.37252  1.41848 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.04769    0.85467  12.926 4.44e-13 ***
## Age          0.12118    0.02936   4.127 0.000316 ***
## PPG          0.16632    0.03905   4.259 0.000222 ***
## RPG         -0.28674    0.15802  -1.815 0.080713 .  
## BPG          1.49306    0.71904   2.076 0.047496 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7114 on 27 degrees of freedom
## Multiple R-squared:  0.5954, Adjusted R-squared:  0.5355 
## F-statistic: 9.933 on 4 and 27 DF,  p-value: 4.475e-05
# Diagnostics for the reduced SF model.
plot(lm3)

#no sign of heteroscedasticity
library(lmtest)
# Studentized Breusch-Pagan test with full interaction and squared
# auxiliary terms (White-style specification).
bptest(lm3, ~ Age * PPG * RPG * BPG + I(Age^2) + I(PPG^2) + I(RPG^2) + I(BPG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm3
## BP = 13.562, df = 19, p-value = 0.8086
#not transform 
# Same heteroscedasticity check on the untransformed-salary counterpart.
lm4 = lm(salary~Age + PPG + RPG + BPG,train2)
bptest(lm4, ~ Age * PPG * RPG * BPG + I(Age^2) + I(PPG^2) + I(RPG^2) + I(BPG^2), data = train2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm4
## BP = 11.547, df = 19, p-value = 0.9041
#no sign of multicollinearity 
library(car)
# Variance inflation factors for lm3; all below 3 here.
vif(lm3)
##      Age      PPG      RPG      BPG 
## 1.003207 1.875995 2.842068 1.742353

Now, remove MPG and consider only the player's skill statistics that might affect his salary.

# Best-subset selection without MPG (skill variables only) for the SF group.
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# Full skills-only model, starting point for step() below.
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# Selected model matches lm3 above (Age + PPG + RPG + BPG).
lm6 = lm(log(salary)~ Age + PPG + RPG + BPG,train2)
summary(lm6)
## 
## Call:
## lm(formula = log(salary) ~ Age + PPG + RPG + BPG, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.33417 -0.40081 -0.07458  0.37252  1.41848 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.04769    0.85467  12.926 4.44e-13 ***
## Age          0.12118    0.02936   4.127 0.000316 ***
## PPG          0.16632    0.03905   4.259 0.000222 ***
## RPG         -0.28674    0.15802  -1.815 0.080713 .  
## BPG          1.49306    0.71904   2.076 0.047496 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7114 on 27 degrees of freedom
## Multiple R-squared:  0.5954, Adjusted R-squared:  0.5355 
## F-statistic: 9.933 on 4 and 27 DF,  p-value: 4.475e-05
#compare rmse for original model and reduced model
# Root-mean-squared error between predictions `x` and observations `y`.
# (Re-definition, identical in effect to the earlier rmse.)  Backward
# compatible: the optional `na.rm` argument (default FALSE, preserving NA
# propagation) lets callers ignore missing prediction/observation pairs.
rmse <- function(x, y, na.rm = FALSE) {
  sqrt(mean((x - y)^2, na.rm = na.rm))
}
# RMSE on the log-salary scale over the full SF training set (train_SF,
# including the rows removed before fitting) for each candidate model.
rmse(predict(lm2,train_SF),log(train_SF$salary))
## [1] 0.9533102
#rmse(predict(lm3,train_SF),log(train_SF$salary))
rmse(predict(lm6,train_SF),log(train_SF$salary))
## [1] 0.9587561
# 95% prediction intervals for log(salary) across the SF training set.
predict(lm6, train_SF, interval="predict")
##          fit      lwr      upr
## 3   14.34873 12.81058 15.88689
## 20  16.33546 14.69209 17.97882
## 38  15.99437 14.43598 17.55277
## 50  16.16912 14.62463 17.71361
## 55  13.87185 12.30206 15.44164
## 56  15.61327 14.11485 17.11169
## 83  15.34755 13.84107 16.85403
## 110 14.94586 13.42144 16.47028
## 116 15.07882 13.58516 16.57248
## 134 16.54660 14.90336 18.18984
## 135 15.59063 14.10057 17.08070
## 172 15.33281 13.84230 16.82331
## 174 14.03587 12.43613 15.63560
## 211 15.82829 14.20332 17.45327
## 235 16.47526 14.71924 18.23128
## 240 14.56215 13.05394 16.07037
## 254 19.60119 17.07727 22.12511
## 258 16.43131 14.78497 18.07766
## 263 14.94690 13.32929 16.56452
## 264 14.18436 12.61192 15.75680
## 282 14.96067 13.44800 16.47333
## 297 14.85752 13.34479 16.37024
## 307 15.34209 13.72725 16.95692
## 312 14.96198 13.45674 16.46721
## 341 14.95646 13.42040 16.49251
## 350 17.08479 15.41507 18.75450
## 352 14.22532 12.69711 15.75353
## 366 16.20897 14.60052 17.81741
## 372 15.26702 13.60959 16.92445
## 373 14.14819 12.58938 15.70701
## 412 14.41647 12.86404 15.96889
## 421 15.89806 14.31743 17.47869
## 442 16.85111 15.11270 18.58951
## 447 16.00273 14.33357 17.67189
## 448 16.39949 14.85966 17.93931
## 453 15.58639 14.00952 17.16327